title: “Assignment 3” author: “Group1” output: html_document —

Team member:Anusha Gunputh #Observation of missing -most data is missing in YearBuilt and Prices but does not mean data in the other variables are reliable and will show a proper indication on the effect of price which is the response.

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(readr)
library(visdat)
library(naniar)
library(forcats)
library(modelr)
library(broom)
## 
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
## 
##     bootstrap
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(dplyr)
library(viridis)
## Loading required package: viridisLite
Melbourne_housing_FULL <- read_csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   Suburb = col_character(),
##   Address = col_character(),
##   Type = col_character(),
##   Method = col_character(),
##   SellerG = col_character(),
##   Date = col_character(),
##   Distance = col_double(),
##   CouncilArea = col_character(),
##   Lattitude = col_double(),
##   Longtitude = col_double(),
##   Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col     row col          expected               actual file                    expected   <int> <chr>        <chr>                  <chr>  <chr>                   actual 1 12094 BuildingArea no trailing characters .3     '/Users/anne/ETC1010/M… file 2 12096 BuildingArea no trailing characters .33    '/Users/anne/ETC1010/M… row 3 12139 BuildingArea no trailing characters .23    '/Users/anne/ETC1010/M… col 4 12223 BuildingArea no trailing characters .51    '/Users/anne/ETC1010/M… expected 5 12252 BuildingArea no trailing characters .3     '/Users/anne/ETC1010/M…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price, Type, Rooms,YearBuilt) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(Bedroom2>0) %>% mutate(Bedroom2 = as.numeric(Bedroom2, ordered=FALSE)) 
summary_miss <- miss_summary(houses_select)
summary_miss$miss_df_prop
## [1] 0.127033
vis_miss(houses_select, sort_miss=TRUE) + theme(aspect.ratio=1)

#plotting variables with missing values- ggplot removed rows with missing values as below -bedroom vs price by type

ggplot(houses_select,
       aes(x = Bedroom2,
           y = Price)) +
  geom_point() + theme(aspect.ratio=1)+ facet_wrap(~Type)
## Warning: Removed 5833 rows containing missing values (geom_point).

#Relationship between chosen variables- Bedroom & Rooms -Looks like price tends to increase while rooms increases, they have a linear relationship

ggplot(houses_select,
       aes(x = Rooms,
           y = Price)) +
  scale_colour_brewer(palette="Dark2") +
  geom_point(alpha=0.1) + 
  geom_smooth(method="lm") +
  theme(aspect.ratio=1) + facet_wrap(~Type) +
  scale_y_log10()
## Warning: Removed 5833 rows containing non-finite values (stat_smooth).
## Warning: Removed 5833 rows containing missing values (geom_point).

ggplot(houses_select,
       aes(x = Bedroom2,
           y = Price)) +
  scale_colour_brewer(palette="Dark2") +
  geom_point(alpha=0.1) + 
  geom_smooth(method="lm") +
  theme(aspect.ratio=1) + scale_y_log10() + facet_wrap(~Type)
## Warning: Removed 5833 rows containing non-finite values (stat_smooth).

## Warning: Removed 5833 rows containing missing values (geom_point).

#Impute by type to show the missingness values in Prices -It appears most values of price is missing in the Type h than Type u and Type t. -One way to interpret the data collected is more data you collect for a particular type tend to leave rooms for invariability, hence more missingness.

Type_h <- bind_shadow(houses_select) %>% 
  filter(Type=="h") %>%
  mutate(Bedroom2 = ifelse(is.na(Bedroom2), 
                             mean(Bedroom2, na.rm=TRUE),
                             Bedroom2),
         Price = ifelse(is.na(Price), 
                             mean(Price, na.rm=TRUE),
                             Price)) 

Type_u <- bind_shadow(houses_select) %>% 
  filter(Type=="u") %>%
  mutate(Bedroom2 = ifelse(is.na(Bedroom2), 
                             mean(Bedroom2, na.rm=TRUE),
                             Bedroom2),
         Price = ifelse(is.na(Price), 
                             mean(Price, na.rm=TRUE),
                             Price)) 

Type_t <- bind_shadow(houses_select) %>% 
  filter(Type=="t") %>%
  mutate(Bedroom2 = ifelse(is.na(Bedroom2), 
                             mean(Bedroom2, na.rm=TRUE),
                             Bedroom2),
         Price = ifelse(is.na(Price), 
                             mean(Price, na.rm=TRUE),
                             Price)) 

 
Type_shadow <- bind_rows(Type_h, Type_u, Type_t)  

ggplot(Type_shadow,
       aes(x = Bedroom2,
           y = Price, 
           colour=Price_NA)) +
  geom_point(alpha=0.7) + 
  facet_grid(Type~.) + 
  scale_colour_brewer(palette="Dark2") +
  theme(aspect.ratio=0.5)

#Dependencies on other variables- YearBuilt -The plot shows year starting late 1800’s data was collected consistently, no missing data for Bedroom2, however, data is missing for prices. -Particularly; the price missing for bedroom=30 and for a house build in 2106 and has 4 bedroom. lastly house build in 1196 and has 2 bedroom. It is reasonable to take it out based on the fact that we cannot compare it with modern houses since we do not know the state of the house and how well maintained it was. Old houses tend to cost millions of dollars in melbourne. It is important to get rid of incorrect data.. Outliers like this will make our model biased.

houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price, Type, Rooms,YearBuilt) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(Bedroom2>0) %>% mutate(Bedroom2 = as.numeric(Bedroom2, ordered=FALSE))

house_shadow1 <- bind_shadow(Melbourne_housing_FULL)
ggplot(data = house_shadow1,
       aes(x = Rooms, y=YearBuilt, colour=Price_NA)) +
       scale_colour_brewer(palette="Dark2") +
       geom_point(alpha=0.7) + theme(aspect.ratio=1) + facet_wrap(~Type) 
## Warning: Removed 19306 rows containing missing values (geom_point).

ggplot(data = house_shadow1,
       aes(x = Bedroom2, y=YearBuilt, colour=Price_NA)) +
       scale_colour_brewer(palette="Dark2") +
       geom_point(alpha=0.7) + theme(aspect.ratio=1) + facet_wrap(~Type) 
## Warning: Removed 19306 rows containing missing values (geom_point).

ggplot(data = house_shadow1,
       aes(x = Bedroom2, y=Rooms, colour=Price_NA)) +
       scale_colour_brewer(palette="Dark2") +
       geom_point(alpha=0.7) + theme(aspect.ratio=1) + facet_wrap(~Type) + ylab("Room") + 
  xlab("Number of Bedroom")
## Warning: Removed 8217 rows containing missing values (geom_point).

#Boxplot- comparing medians -Looks like the median price is $910,000 for 6 bedroom and the starting price for 1 bedroom is $85,000, max price is $11,200,000 for 4 bedroom.

library(forcats)
houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price, Type, Rooms) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(Bedroom2>0) %>% filter(!is.na(Price)) %>% mutate(Bedroom2 = as.numeric(Bedroom2))
houses_select<-na.omit(houses_select)
houses_select <- rename(houses_select, Bedroom = Bedroom2)


house_box <- houses_select %>% 
  mutate(Bedroom=factor(Bedroom, 
    levels=c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10","16", "12")))
ggplot(data=house_box, aes(x=Bedroom, y=Price)) +
  geom_boxplot()

summary(houses_select)
##     Bedroom           Price          Type          Rooms       
##  Min.   : 1.000   Min.   :   85000   h:15720   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.:  660000   t: 1579   1st Qu.: 2.000  
##  Median : 3.000   Median :  910000   u: 3491   Median : 3.000  
##  Mean   : 3.049   Mean   : 1090213             Mean   : 3.061  
##  3rd Qu.: 4.000   3rd Qu.: 1335000             3rd Qu.: 4.000  
##  Max.   :20.000   Max.   :11200000             Max.   :16.000
summary(houses_select$Price)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    85000   660000   910000  1090213  1335000 11200000
##    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 85000   635000   870000  1050173  1295000 11200000     7610 

Categorical and quantitive variable

library(labelled)
Melbourne_housing_FULL <- read_csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   Suburb = col_character(),
##   Address = col_character(),
##   Type = col_character(),
##   Method = col_character(),
##   SellerG = col_character(),
##   Date = col_character(),
##   Distance = col_double(),
##   CouncilArea = col_character(),
##   Lattitude = col_double(),
##   Longtitude = col_double(),
##   Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col     row col          expected               actual file                    expected   <int> <chr>        <chr>                  <chr>  <chr>                   actual 1 12094 BuildingArea no trailing characters .3     '/Users/anne/ETC1010/M… file 2 12096 BuildingArea no trailing characters .33    '/Users/anne/ETC1010/M… row 3 12139 BuildingArea no trailing characters .23    '/Users/anne/ETC1010/M… col 4 12223 BuildingArea no trailing characters .51    '/Users/anne/ETC1010/M… expected 5 12252 BuildingArea no trailing characters .3     '/Users/anne/ETC1010/M…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price, Type, Rooms, YearBuilt) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(Bedroom2>0, Bedroom2<10) %>% filter(!is.na(Price)) %>% filter(!is.na(Rooms))%>% filter(YearBuilt>1200)

houses_select <- rename(houses_select, Bedroom = Bedroom2) %>% select(Price, Bedroom, Type, Rooms)
  
ggplot(houses_select, aes(x=Price, y=Bedroom,
                            colour=Type)) + 
  geom_point(alpha=0.3) + 
  scale_colour_brewer("Bedroom", palette="Dark2") +
  facet_wrap(~Type, nrow=3) + scale_x_log10() + geom_smooth(method="lm", se=FALSE) + 
  theme(legend.position="bottom") + 
  xlab("Price") + 
  ylab("Number of Bedroom")

#Fitting a linear model- correlation between Price and Bedroom2

houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price,Rooms, Type, YearBuilt) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(!is.na(Bedroom2))%>% filter(Bedroom2>0)  %>% mutate(Bedroom2 = as.numeric(Bedroom2, ordered=FALSE)) %>% filter(!is.na(Price)) %>% mutate(Price_log = log10(Price)) %>% mutate(Price = as.numeric(Price),ordered=FALSE) %>% filter(YearBuilt>1200)

houses_select <- rename(houses_select, Bedroom = Bedroom2)

houses_fit1 <- lm(Price_log~Bedroom + Rooms, data=houses_select)
houses_fit2 <- lm(Price_log~Bedroom*Rooms, data=houses_select)
tidy(houses_fit1)
##          term   estimate   std.error  statistic      p.value
## 1 (Intercept) 5.57329360 0.005927029 940.318269 0.000000e+00
## 2     Bedroom 0.01107983 0.007810634   1.418557 1.560539e-01
## 3       Rooms 0.11746504 0.007793065  15.073021 7.068066e-51
##Analysis: without interaction
#Bedroom:Given there is one additional bedroom in any type of house, the price increases by 0.01347383

#Rooms: Given there is one additional roomns in any type of house, the price increases by 0.10509578
tidy(houses_fit2)
##            term    estimate   std.error  statistic      p.value
## 1   (Intercept)  5.41409489 0.011625162 465.722103 0.000000e+00
## 2       Bedroom  0.06850869 0.008536603   8.025287 1.106307e-15
## 3         Rooms  0.16886411 0.008366419  20.183559 4.000222e-89
## 4 Bedroom:Rooms -0.01690853 0.001065966 -15.862160 4.264035e-56
confint_tidy(houses_fit2)
##      conf.low   conf.high
## 1  5.39130771  5.43688207
## 2  0.05177558  0.08524180
## 3  0.15246459  0.18526363
## 4 -0.01899799 -0.01481906
##Analysis: with interaction
#Bedroom*Rooms:Given there is one additional bedroom and one additional rooms in any type of house, the price decreases by 0.01288828
#Conclusion: the more rooms we add to the type of house the price starts to decrease

glance(houses_fit1)
##   r.squared adj.r.squared     sigma statistic p.value df   logLik
## 1 0.2881387     0.2880208 0.1955431  2443.787       0  3 2574.546
##         AIC       BIC deviance df.residual
## 1 -5141.092 -5111.495 461.7132       12075
glance(houses_fit2)
##   r.squared adj.r.squared    sigma statistic p.value df   logLik       AIC
## 1 0.3026702      0.302497 0.193545  1746.873       0  4 2699.098 -5388.197
##         BIC deviance df.residual
## 1 -5351.201  452.288       12074
#houses_fit2 has an interaction and is a better fit since r squared is higher than house_fit1

Visualising models

-The models are clearly different. The response is plotted against the two variable on two different level. And the interaction term makes it a much better fit as they interlaps each other while the non interaction term has paralell lines. Model 2 is definitely a better fit. -Model 1: Adding an additional bedroom to the type of house makes the price go up by a coefficient of 0.01107983 and adding an additional room will make it go up by 0.11746504. -Model 2: However, taking interaction into consideration, a house with one bedroom but adding an additional room will the price go down by a coefficient of -0.01690853 which mean even if a house has a lot of bedrooms, the fact that it has more rooms than bedroom or an equal number of rooms will make the price go down as compared to the mean price of any type.

house_mod1 <- augment(houses_fit1, houses_select)
house_mod2 <- augment(houses_fit2, houses_select)
p1 <- ggplot(house_mod1, aes(x=Bedroom, y=.fitted, colour=Rooms, group=Rooms)) +
  geom_line() +  scale_colour_viridis() + 
  theme(legend.position="none") + ggtitle("Model 1: no interaction")
p2 <- ggplot(house_mod2, aes(x=Bedroom, y=.fitted, colour=Rooms, group=Rooms)) +
  geom_line() +  scale_colour_viridis() + 
  theme(legend.position="none") + ggtitle("Model 2: with an interaction") 

p3 <- ggplot(house_mod1, aes(x=Rooms, y=.fitted, colour=Bedroom, group=Bedroom)) +
  geom_line() +  scale_colour_viridis() + 
  theme(legend.position="none") + ggtitle("Model 1: no interaction")
p4 <- ggplot(house_mod2, aes(x=Rooms, y=.fitted, colour=Bedroom, group=Bedroom)) +
  geom_line() +  scale_colour_viridis() + 
  theme(legend.position="none") + ggtitle("Model 2: with an interaction") 
grid.arrange(p1, p2, p3, p4, ncol=2)

#Residuals vs Fitted
p1 <- ggplot(house_mod2, aes(x=.fitted, y=.std.resid)) +
  geom_point() + ggtitle("Model 1")
p2 <- ggplot(house_mod2, aes(x=.fitted, y=.std.resid)) +
  geom_point() + ggtitle("Model 2")
grid.arrange(p1, p2, ncol=2)

#The residuals and fitted is the same since we took out the missing values.

Team member:Fengqiu Fang

find the price missing value and add new columns that are functions of existing columns-Variables used room,Price,Rooms,Type

room<-read_csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   Suburb = col_character(),
##   Address = col_character(),
##   Type = col_character(),
##   Method = col_character(),
##   SellerG = col_character(),
##   Date = col_character(),
##   Distance = col_double(),
##   CouncilArea = col_character(),
##   Lattitude = col_double(),
##   Longtitude = col_double(),
##   Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col     row col          expected               actual file                    expected   <int> <chr>        <chr>                  <chr>  <chr>                   actual 1 12094 BuildingArea no trailing characters .3     '/Users/anne/ETC1010/M… file 2 12096 BuildingArea no trailing characters .33    '/Users/anne/ETC1010/M… row 3 12139 BuildingArea no trailing characters .23    '/Users/anne/ETC1010/M… col 4 12223 BuildingArea no trailing characters .51    '/Users/anne/ETC1010/M… expected 5 12252 BuildingArea no trailing characters .3     '/Users/anne/ETC1010/M…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
room<-select(room,Price,Rooms,Type)

Type_h<-bind_shadow(room)%>%
  filter(Type=='h')%>%
  mutate(Rooms=ifelse(is.na(Rooms),mean(Rooms,na.rm=TRUE),Rooms),Price=ifelse(is.na(Price),mean(Price,na.rm=TRUE),Price))
Type_u<-bind_shadow(room)%>%
  filter(Type=='u')%>%
  mutate(Rooms=ifelse(is.na(Rooms),mean(Rooms,na.rm=TRUE),Rooms),Price=ifelse(is.na(Price),mean(Price,na.rm=TRUE),Price))
Type_t<-bind_shadow(room)%>%
  filter(Type=='t')%>%
  mutate(Rooms=ifelse(is.na(Rooms),mean(Rooms,na.rm=TRUE),Rooms),Price=ifelse(is.na(Price),mean(Price,na.rm=TRUE),Price))
Type_shadow<-bind_rows(Type_h,Type_u,Type_t)

missing values relationship with other variables- Rooms & Type

#missing values relationship with other variables- Rooms & Type
ggplot(Type_shadow,aes(x=Rooms,y=Price,colour=Price_NA))+geom_point(alpha=0.7)+facet_grid(Type~.)+scale_colour_brewer(palette='Dark2')+
  theme(aspect.ratio = 0.5)

#Delete the missing value and mutate the variables as numeric

library(labelled)
room <- room %>% 
  filter(!is.na(Rooms)) %>%
  filter(!is.na(Price)) %>% 
  select(Price,Rooms, Type) %>%
  mutate(Rooms = as.numeric(Rooms)) %>%
  mutate(Price = as.numeric(Price)) %>%
  filter(Price>0)

find the relationship between log10(price) and Rooms devided by house types

library(ggplot2)
ggplot(room,  aes(x = Rooms, y =log10(Price )))  + geom_point(alpha=0.1)+
  geom_smooth(method="lm") + theme(aspect.ratio=1)+ facet_wrap(~ Type, nrow = 2)

#build two models to identify house types and rooms which may influence the house price

room <- room %>%
  mutate(log_price = log10(Price)) %>%
  mutate(type = factor(Type, ordered=FALSE))
mod1 <- lm(log_price~Type+Rooms, data=room)
mod2 <- lm(log_price~Type*Rooms, data=room)
tidy(mod1)
##          term    estimate   std.error  statistic      p.value
## 1 (Intercept)  5.72878127 0.004862399 1178.17995 0.000000e+00
## 2       Typet -0.05044409 0.003748501  -13.45713 3.784551e-41
## 3       Typeu -0.14605052 0.003309374  -44.13237 0.000000e+00
## 4       Rooms  0.08953308 0.001406000   63.67928 0.000000e+00
tidy(mod2)
##          term    estimate   std.error   statistic       p.value
## 1 (Intercept)  5.77808632 0.005418032 1066.454768  0.000000e+00
## 2       Typet -0.18962632 0.015762260  -12.030402  2.986799e-33
## 3       Typeu -0.32464511 0.009514007  -34.122858 6.199943e-250
## 4       Rooms  0.07468772 0.001580268   47.262704  0.000000e+00
## 5 Typet:Rooms  0.04598135 0.005233746    8.785553  1.645564e-18
## 6 Typeu:Rooms  0.07900155 0.004020568   19.649351  2.286745e-85
confint_tidy(mod2)
##      conf.low   conf.high
## 1  5.76746670  5.78870594
## 2 -0.22052116 -0.15873149
## 3 -0.34329305 -0.30599717
## 4  0.07159031  0.07778512
## 5  0.03572294  0.05623976
## 6  0.07112103  0.08688207

augment the models and plot modules

room_mod1 <- augment(mod1, room)
room_mod2 <- augment(mod2, room)
ggplot(room, aes(x=Rooms, y=log_price,
                 colour=Type)) + 
  geom_point(alpha=0.1) + 
  scale_colour_brewer("Type", palette="Dark2") +
  facet_wrap(~Type, ncol=4) + 
  theme(legend.position="bottom") + 
  xlab("Rooms") + 
  ylab("log_price") +
  geom_line(data=room_mod1, aes(y=.fitted)) + 
  ggtitle("Model 1")

ggplot(room, aes(x=Rooms, y=log_price,
                 colour=Type)) + 
  geom_point(alpha=0.1) + 
  scale_colour_brewer("Type", palette="Dark2") +
  facet_wrap(~Type, ncol=4) + 
  theme(legend.position="bottom") + 
  xlab("Rooms") + 
  ylab("log_price") +
  geom_line(data=room_mod2, aes(y=.fitted)) + 
  ggtitle("Model 2")

p1 <- ggplot(room_mod1, aes(x=Rooms, y=.fitted,
                            colour=Type)) +
  geom_line() +
  scale_colour_brewer("Type", palette="Dark2") +
  theme(legend.position="bottom") + 
  xlab("Rooms") + 
  ylab("log_price") + ggtitle("Model 1")
p2 <- ggplot(room_mod2, aes(x=Rooms, y=.fitted,
                            colour=Type)) +
  geom_line() +
  scale_colour_brewer("Type", palette="Dark2") +
  theme(legend.position="bottom") + 
  xlab("Rooms") + 
  ylab("log_price") + ggtitle("Model 2")
grid.arrange(p1, p2, ncol=2)

#compared two models,the mod2 is better than mod1

glance(mod1)
##   r.squared adj.r.squared     sigma statistic p.value df   logLik
## 1 0.3268587     0.3267846 0.1842156  4409.481       0  4 7432.523
##         AIC       BIC deviance df.residual
## 1 -14855.05 -14813.98 924.5021       27243
glance(mod2)
##   r.squared adj.r.squared     sigma statistic p.value df   logLik
## 1 0.3372794     0.3371578 0.1827909  2772.761       0  6 7645.074
##         AIC       BIC deviance df.residual
## 1 -15276.15 -15218.66 910.1901       27241
glance(room_mod1)
##    nrow ncol complete.obs na.fraction
## 1 27247   12        27247           0
glance(room_mod2)
##    nrow ncol complete.obs na.fraction
## 1 27247   12        27247           0

explanatory variables

p1 <- ggplot(room_mod1, aes(x=Type, y=.fitted, colour=Rooms, group=Rooms)) +
  geom_line() +  scale_colour_viridis() + 
  theme(legend.position="none") + ggtitle("Model 1: no interaction")


p2 <- ggplot(room_mod2, aes(x=Type, y=.fitted, colour=Rooms, group=Rooms)) +
  geom_line() +  scale_colour_viridis() + 
  theme(legend.position="none") + ggtitle("Model 2: with an interaction") 

p3 <- ggplot(room_mod1, aes(x=Rooms, y=.fitted, colour=Type, group=Type)) +
  geom_line() +
  theme(legend.position="none") + ggtitle("Model 1: no interaction")
p4 <- ggplot(room_mod2, aes(x=Rooms, y=.fitted, colour=Type, group=Type)) +
  geom_line() + 
  theme(legend.position="none") + ggtitle("Model 2: with an interaction") 
grid.arrange(p1, p2, p3, p4, ncol=2)

# Team member: Xiaoru Chen

library(tidyverse)
library(modelr)
library(labelled)
library(broom)
ass2 <- read.csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
ass3 <- ass2 
#choose the collumn of variables
assm <- select(ass3, Price, Distance, Rooms)
#plot the graph of missing values
library(visdat)
vis_dat(assm)

#Delete the missing value of Variables Distance and Rooms and select the collumn of variables
ass4 <- filter(ass3, !is.na(Distance)) %>% filter(!is.na(Rooms)) 
ass5 <- filter(ass4, !is.na(Price)) 
ass6 <- select(ass5, Price, Distance, Rooms)  %>% mutate(Distance = to_factor(Distance, ordered=TRUE, drop_unused_labels = TRUE))
#Transfrom the variables into numberic type and create a new collumn with log10(Price) so the scale of price can bechanged into reasonable scale

ass7 <- mutate(ass6, Price = as.numeric(Price)) 
ass8 <- mutate(ass7, Distance = as.numeric(Distance)) 
ass9 <- mutate(ass7, Rooms = as.numeric(Rooms))
ass10 <- mutate(ass8, Price = log10(Price))
#Use ggplot to draw a graph with fitted line 
p1 <- ggplot(ass10, mapping = aes(x=Distance, y=Price))+geom_point()+geom_smooth(method="lm", se=FALSE)
ggplot(ass10, mapping = aes(x=Rooms, y=Price))+geom_point()+geom_smooth(method="lm", se=FALSE)

#Create a new model of using distance and rooms as independent variables and price as dependent variable
pisa_lm <- lm(Price~Distance+Rooms, data=ass10)
#create a model with interaction
pisa_lm1 <- lm(Price~Distance+Rooms-Distance*Rooms, data=ass10)
#Use tidy and glance funciton to get the estimated values of parameters and the r squared
tidy(pisa_lm)
##          term     estimate    std.error  statistic       p.value
## 1 (Intercept) 5.5313586106 4.312030e-03 1282.77355  0.000000e+00
## 2    Distance 0.0004338575 1.617518e-05   26.82243 1.908242e-156
## 3       Rooms 0.1276028495 1.201559e-03  106.19777  0.000000e+00
glance(pisa_lm)
##   r.squared adj.r.squared     sigma statistic p.value df   logLik
## 1 0.2970719     0.2970203 0.1882439  5756.938       0  3 6842.634
##         AIC       BIC deviance df.residual
## 1 -13677.27 -13644.42 965.4117       27244
tidy(pisa_lm1)
##          term estimate  std.error statistic p.value
## 1 (Intercept) 5.959711 0.00136016  4381.624       0
glance(pisa_lm1)
##   r.squared adj.r.squared     sigma statistic p.value df   logLik
## 1         0             0 0.2245171        NA      NA  1 2040.341
##         AIC       BIC deviance df.residual
## 1 -4076.683 -4060.257 1373.415       27246
#From the result we can see the model with interaction has r squared of 0, therefore we choose anther model with no interaction.

Team member: Anushka Fernando

library(naniar)
library(visdat)
library(broom)
library(tidyverse)
library(dplyr)
library(tidyr)
library(ggplot2)
Melbourne_housing_FULL <- read_csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   Suburb = col_character(),
##   Address = col_character(),
##   Type = col_character(),
##   Method = col_character(),
##   SellerG = col_character(),
##   Date = col_character(),
##   Distance = col_double(),
##   CouncilArea = col_character(),
##   Lattitude = col_double(),
##   Longtitude = col_double(),
##   Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col     row col          expected               actual file                    expected   <int> <chr>        <chr>                  <chr>  <chr>                   actual 1 12094 BuildingArea no trailing characters .3     '/Users/anne/ETC1010/M… file 2 12096 BuildingArea no trailing characters .33    '/Users/anne/ETC1010/M… row 3 12139 BuildingArea no trailing characters .23    '/Users/anne/ETC1010/M… col 4 12223 BuildingArea no trailing characters .51    '/Users/anne/ETC1010/M… expected 5 12252 BuildingArea no trailing characters .3     '/Users/anne/ETC1010/M…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
vis_miss(Melbourne_housing_FULL, sort_miss=TRUE) + theme(aspect.ratio=1)

ggplot(Melbourne_housing_FULL,
       aes(x = Bathroom,
                    y = Bedroom2)) +
  geom_point() + theme(aspect.ratio=1)
## Warning: Removed 8226 rows containing missing values (geom_point).

#Warning message:
  #Removed 3557 rows containing missing values (geom_point). 

ggplot(Melbourne_housing_FULL,
         aes(x = Bathroom,
                      y = Bedroom2)) +
  scale_colour_brewer(palette="Dark2") +
  geom_miss_point() + theme(aspect.ratio=1)

#21.7% missing values, 0.1240107, proportion of observations missing
#0.4285714 proportion of variables missing

#3557 rows contained missing values
#The plot showing missing values shows that the missing values are from the 0 values. 
houses_select <- Melbourne_housing_FULL %>%
  select(Bathroom, Price, Type, Rooms) %>%
  mutate(Type = factor(Type, ordered = FALSE)) %>% 
  filter(Bathroom>0) %>% mutate(Bathroom = as.numeric(Bathroom, ordered=FALSE)) %>% 
  mutate(Price = as.numeric(Price)) %>%
  filter(Price>0) %>%
  filter(is.na(Price))%>% mutate(Price_log = log10(Price))

Type_h <- bind_shadow(Melbourne_housing_FULL) %>%
  filter(Type=="h") %>%
  mutate(Bathroom = ifelse(is.na(Bathroom),
              mean(Bathroom, na.rm = TRUE),
              Bathroom),
         Price = ifelse(is.na(Price),
                        mean(Price, na.rm = TRUE),
                        Price))
Type_u <- bind_shadow(Melbourne_housing_FULL) %>%
  filter(Type=="u") %>%
  mutate(Bathroom = ifelse(is.na(Bathroom),
                           mean(Bathroom, na.rm=TRUE),
                           Bathroom),
         Price = ifelse(is.na(Price),
                        mean(Price, na.rm=TRUE),
                        Price))
Type_t <- bind_shadow(Melbourne_housing_FULL) %>%
  filter(Type=="t") %>%
  mutate(Bathroom = ifelse(is.na(Bathroom),
                           mean(Bathroom, na.rm = TRUE),
                           Bathroom),
         Price = ifelse(is.na(Price),
                        mean(Price, na.rm = TRUE),
                        Price))
Type_shadow <- bind_rows(Type_h, Type_u, Type_t)
#Plotted final missingness data, Bathroom and Price
ggplot(Type_shadow,
       aes(x = Bathroom,
           y = Price,
           colour=Price_NA)) + 
  geom_point(alpha=0.7) +
  facet_grid(Type~.) +
  scale_colour_brewer(palette = "Dark2") +
  theme(aspect.ratio = 0.5)

Inference

#Comparing the R square of the 3 models above, based on the R-squares 0.3372794 (log_price~Type*Rooms)which is higher than 0.2970719    (log10(price) vs distance*rooms) and 0.3026702 (log10(price) vs bedroom*rooms) , we can conclude that the second model which is (log_price~Type*Rooms) is the best model.
#We can be 95% confident that the mean increase in price is between 0.03572294 and 0.05623976   when the type is t and has 1 additional room compared to the type h and 95% confident of an increase in price between 0.07112103 and    0.08688207 when the type is u and has an additional rooms compared to type h.